
do "E:/ReplicateBuild/02_code/00_environment/00_set_environment.do"

*Table of Contents
local matchData = 1
local applicationsData = 1
local applicantsData = 1
local vacancyData = 1
local districtAppsData = 1
	local dataCheck = 1

********************************************************************************
* Read in HRMS / NCERDC crosswalk
********************************************************************************

if `matchData' == 1 {
	********************************************************************************
	* Create crosswalk of hrms_app_id and ncerdc_id
	* Identifier: hrms_app_id
	********************************************************************************

	use "$rawdata/hrms/profile_matchlist", clear
	ren (APP_ID CAN_CITY_AD CAN_STATE_AD CAN_ZIP_AD CAN_ZIP4_AD CAN_TCITY_AD CAN_TSTATE_AD CAN_TZIP_AD CAN_TZIP4_AD CAN_CURRELQ_IND CAN_RELLOC_TXT teachid matchtype) ///
		(hrms_app_id can_city can_state can_zip can_zip4 can_tcity can_tstate can_tzip can_tzip4 can_currelq_ind can_relloc_txt ncerdc_id hrms_ncerdc_matchtype)
	
	
	
	if `dataCheck' == 1 {
		
		unique hrms_app_id // uniquely identify observations
		
		unique ncerdc_id if ncerdc_id!=. // not quite unique -- some teachers had multiple accounts

	}
	

	sort hrms_app_id
	
	compress

	save "$basedata/hrms_ncerdc_xwalk", replace
		
}	

********************************************************************************
* Read in HRMS applications to schools
********************************************************************************

if `applicationsData' == 1 {
	********************************************************************************
	* Create dataset of all unique applications.
	* Identifier: hrms_app_id hrms_job_id
	********************************************************************************

	import delimited using "$rawdata/hrms/Shared_Key.txt", clear bindquotes(loose) varnames(1)
	ren (app_id vac_docid) (hrms_app_id hrms_job_id)
	
	if `dataCheck' == 1 {
		
		unique hrms_app_id hrms_job_id // uniquely identify observations
	
		duplicates tag, gen(dupvar)
		assert dupvar==0
		drop dupvar
		
		bys hrms_job_id: egen minCD = min(vac_unit_cd)
		bys hrms_job_id: egen maxCD = max(vac_unit_cd)
		tab hrms_job_id if minCD!=maxCD
		tab vac_unit_cd if minCD!=maxCD
		drop minCD maxCD

	}
	
	bys hrms_job_id: egen minCD = min(vac_unit_cd)
	bys hrms_job_id: egen maxCD = max(vac_unit_cd)
	gen vac_multiCD = minCD!=maxCD
	drop minCD maxCD
	
	ren vac_unit_cd vac_lea_id 
	sort hrms_app_id hrms_job_id
	
	compress

	save "$basedata/hrms_school_apps", replace
		
}	

********************************************************************************
* Read in HRMS applicants
********************************************************************************

if `applicantsData' == 1 {
	********************************************************************************
	* Create dataset of all unique applicants.
	* Identifier: hrms_app_id
	********************************************************************************

	import delimited using "$rawdata/hrms/application.txt", clear bindquotes(loose) varnames(1)

	* shift variables over when quotation marks messed up the import
	
	* set 1
	gen shift = (v31!="") // single obs
	replace app_reasonq_txt = app_reasonq_txt + "," + app_timestamp + "," + app_location_pref_txt if shift==1
	replace app_timestamp = "" if shift==1
	replace app_location_pref_txt = "" if shift==1
	forv vv=15/22 {
		replace app_reasonq_txt = app_reasonq_txt + "," + v`vv' if shift==1
		replace v`vv' = "" if shift==1
 	}
	assert v21=="" & v22==""
	drop v21 v22
	replace app_reasonq_txt = app_reasonq_txt + ", Washington." if shift==1
	replace app_timestamp = "2014-04-30-17.11.02.745542" if shift==1
	replace app_location_pref_txt = "I would like to substitue again. I like Aces also," if shift==1
	replace v23 = "" if shift==1
	replace v24 = "" if shift==1
	assert v23=="" & v24==""
	drop v23 v24
	forv vv=25/31 {
		replace app_location_pref_txt = app_location_pref_txt + "," + v`vv' if shift==1
		replace v`vv' = "" if shift==1
		assert v`vv' == ""
		drop v`vv'
 	}
	drop shift
	
	* set 2
	gen shift = (v20!="") // single obs
	replace app_reasonq_txt = app_reasonq_txt + "," + app_timestamp + ", I rather be in the City than in the County just for that reasons." if shift==1
	replace app_timestamp = "" if shift==1
	replace app_location_pref_txt = "" if shift==1
	replace app_timestamp = "2015-06-15-13.36.22.437031" if shift==1
	replace app_location_pref_txt = "I " if shift==1
	forv vv=15/20 {
		replace app_location_pref_txt = app_location_pref_txt + "," + v`vv' if shift==1
		replace v`vv' = "" if shift==1
 	}
	assert v20==""
	drop v20 shift
	
	* search out timestamps
	
	gen missing_timestamp = substr(app_timestamp,1,2)!="20" | substr(app_timestamp,5,1)!="-" | substr(app_timestamp,6,1)=="2"
	gen timestamp_temp = ""
	forv vv=15/19 {
		gen tpos = strpos(v`vv',"20")
		gen dashpos = substr(v`vv',tpos+4,1) if substr(v`vv',tpos+5,1)!="2"
		replace timestamp_temp = substr(v`vv',tpos,26) if tpos>0 & dashpos=="-" & timestamp_temp=="" & missing_timestamp==1
		drop tpos dashpos
	}
	gen tpos = strpos(app_location_pref_txt,"20")
	gen dashpos = substr(app_location_pref_txt,tpos+4,1) if substr(app_location_pref_txt,tpos+5,1)!="2"
	replace timestamp_temp = substr(app_location_pref_txt,tpos,26) if tpos>0 & dashpos=="-" & timestamp_temp=="" & missing_timestamp==1
	drop tpos dashpos
	
	gen tpos = strpos(app_timestamp,"20")
	gen dashpos = substr(app_timestamp,tpos+4,1) if substr(app_timestamp,tpos+5,1)!="2"
	replace timestamp_temp = substr(app_timestamp,tpos,26) if tpos>0 & dashpos=="-" & timestamp_temp=="" & missing_timestamp==1
	drop tpos dashpos
	
	gen tpos = strpos(app_reasonq_txt,"20")
	gen dashpos = substr(app_reasonq_txt,tpos+4,1) if substr(app_reasonq_txt,tpos+5,1)!="2"
	replace timestamp_temp = substr(app_reasonq_txt,tpos,26) if tpos>0 & dashpos=="-" & timestamp_temp=="" & missing_timestamp==1
	drop tpos dashpos
	
	* remaining issues are timestamps after "20" used elsewhere // 7 of these
	replace timestamp_temp = "2004-03-14-21.18.42.486526" if strpos(app_reasonq_txt,"2004-03-14-21.18.42.486526")>0
	replace timestamp_temp = "2016-04-25-15.27.59.020070" if strpos(app_comment_txt,"2016-04-25-15.27.59.020070")>0
	replace timestamp_temp = "2009-12-09-13.49.47.652833" if strpos(app_reasonq_txt,"2009-12-09-13.49.47.652833")>0
	replace timestamp_temp = "2013-04-10-01.45.19.369360" if strpos(app_reasonq_txt,"2013-04-10-01.45.19.369360")>0
	replace timestamp_temp = "2011-12-29-20.26.12.782709" if strpos(app_reasonq_txt,"2011-12-29-20.26.12.782709")>0
	replace timestamp_temp = "2014-01-29-11.59.39.122420" if strpos(app_reasonq_txt,"2014-01-29-11.59.39.122420")>0
	replace timestamp_temp = "2013-08-27-10.36.38.808200" if strpos(app_location_pref_txt,"2013-08-27-10.36.38.808200")>0	
	assert timestamp_temp!="" if missing_timestamp==1
	replace app_timestamp = timestamp_temp if missing_timestamp==1
	drop missing_timestamp timestamp_temp
	
	drop v15 v16 v17 v18 v19 // bring these back if we want to parse text

	ren (app_id) (hrms_app_id)

	
	foreach var in "app_avail_dte" "app_mod_dte" "app_create_dte" "app_timestamp" {
		ren `var' `var'_str
		replace `var'_str = "" if `var'_str=="."
		gen yy = substr(`var'_str,1,4)
		gen mm = substr(`var'_str,6,2)
		gen dd = substr(`var'_str,9,2)
		destring yy mm dd, replace
		gen `var' = mdy(mm,dd,yy)
		assert `var'_str == "" if `var'==.
		drop `var'_str yy mm dd
		format `var' %d
	}
	
	if `dataCheck' == 1 {
		unique hrms_app_id // this is the unique identifer

	}
	sort hrms_app_id
	
	
	merge 1:1 hrms_app_id using "$basedata/hrms_ncerdc_xwalk", keepusing(ncerdc_id)
	assert _m!=2 // 99.92 match
	drop _m
	sort hrms_app_id
	
	compress
	
	
	save "$basedata/hrms_applicants", replace
}	


********************************************************************************
* Read in HRMS vacancies
********************************************************************************

if `vacancyData' == 1 {
	********************************************************************************
	* Create dataset of all unique vacancies
	* Identifier: hrms_job_id vac_school_id vac_lea_id
	********************************************************************************


	import delimited using "$rawdata/hrms/app_vacancy.txt", clear bindquotes(loose)
	
	tostring v30 v31 v34, replace // need to put these as strings to fix shifted rows
	
	* shift variables over when quotation marks messed up the import
	
	* set 1
	gen shift = (v38!="")
	replace v18 = v18 + " " + v19 + " " + v20 if shift==1
	forv vv=19/36 {
		local vplus2 = `vv'+2
		replace v`vv' = v`vplus2' if shift==1
 	}
	replace v37 = "" if shift==1
	replace v38 = "" if shift==1
	assert v38==""
	drop v38 shift
	
	* set 2
	gen shift = (v37!="") 
	replace v12 = v12 + " " + v13 if shift==1
	forv vv=13/36 {
		local vplus1 = `vv'+1
		replace v`vv' = v`vplus1' if shift==1
 	}
	replace v37 = "" if shift==1
	assert v37==""
	drop v37 shift
	
	* set 3
	gen shift = strpos(v19,"endorsement") > 0 
	replace v18 = v18 + " " + v18 if shift==1
	forv vv=19/35 {
		local vplus1 = `vv'+1
		replace v`vv' = v`vplus1' if shift==1
 	}
	replace v36 = "" if shift==1
	drop shift	
	
	* set 4
	gen shift = strpos(v19,"sign") > 0
	replace v18 = v18 + " " + v19 + " " + v20 if shift==1
	replace v19 = "" if shift==1
	replace v20 = "" if shift==1
	drop shift
		
	* set 5
	gen shift = strpos(v14,"interview") > 0
	replace v12 = v12 + " " + v13 + " " + v14 if shift==1 
	forv vv=13/16 {
		local vplus2 = `vv'+2
		replace v`vv' = v`vplus2' if shift==1
 	}
	replace v17 = "" if shift==1
	replace v18 = "" if shift==1
	drop shift
	
	* set 6
	gen shift = strpos(v14,"completion") > 0 // one obs
	replace v12 = v12 + " " + v13 + " " + v14 if shift==1 
	replace v13 = "F" if shift==1 // manual fixes because the import didn't parse
	replace v14 = "0" if shift==1
	replace v15 = "" if shift==1
	replace v16 = "$1,935.51 - $3,087.71" if shift==1
	replace v17 = "312" if shift==1
	replace v18 = "" if shift==1
	replace v19 = "NC" if shift==1
	replace v20 = "90" if shift==1
	replace v21 = "2014-07-18" if shift==1
	replace v22 = "" if shift==1
	replace v23 = "10.00" if shift==1
	replace v24 = "Except Child Teacher Asst" if shift==1
	replace v25 = "080" if shift==1
	replace v26 = "" if shift==1
	replace v27 = "" if shift==1
	replace v28 = "" if shift==1
	replace v29 = "" if shift==1
	replace v30 = "" if shift==1
	replace v31 = "1" if shift==1
	replace v32 = "" if shift==1
	replace v33 = "" if shift==1
	replace v34 = "100" if shift==1
	replace v35 = ".00" if shift==1
	replace v36 = "S" if shift==1
	drop shift
	
	* set 7
	gen shift = strpos(v13,"ESL") > 0 // one obs
	replace v12 = "'<a  href=http://p1cdn2static.sharpschool.com/UserFiles/Servers/Server_3727387/File/H R%20documents/ESL%20Teacher.pdf'> job descripton </a>" if shift==1 
	replace v13 = "F" if shift==1 // manual fixes because the import didn't parse
	replace v14 = "0" if shift==1
	replace v15 = "" if shift==1
	replace v16 = "NC Teacher State Salary Schedule" if shift==1
	replace v17 = "332" if shift==1
	replace v18 = "Must hold a valid NC license in ESL. Please click on the link job description for full details." if shift==1
	replace v19 = "NC" if shift==1
	replace v20 = "90" if shift==1
	replace v21 = "2016-03-07" if shift==1
	replace v22 = "255 Elementary Road" if shift==1
	replace v23 = "10.00" if shift==1
	replace v24 = "Teacher ESL" if shift==1
	replace v25 = "710" if shift==1
	replace v26 = "28457" if shift==1
	replace v27 = "" if shift==1
	replace v28 = "" if shift==1
	replace v29 = "" if shift==1
	replace v30 = "" if shift==1
	replace v31 = "0" if shift==1
	replace v32 = "" if shift==1
	replace v33 = "" if shift==1
	replace v34 = "100.00" if shift==1
	replace v35 = ".00" if shift==1
	replace v36 = "S" if shift==1
	drop shift
	
	* set 8
	gen shift = strpos(v13,"Technology") > 0 // one obs
	replace v12 = "Multimedia & Webpage Design and Career Management; willing to be Yearbook  advisor" if shift==1 
	replace v13 = "F" if shift==1 // manual fixes because the import didn't parse
	replace v14 = "0" if shift==1
	replace v15 = "" if shift==1
	replace v16 = "NC Teacher State Salary Schedule" if shift==1
	replace v17 = "321" if shift==1
	replace v18 = "Must hold a valid NC license in Vocational Business Education. May include  assistant football coaching responsibilities" if shift==1
	replace v19 = "NC" if shift==1
	replace v20 = "0" if shift==1
	replace v21 = "" if shift==1
	replace v22 = "14328 NC Hwy 201" if shift==1
	replace v23 = "10.00" if shift==1
	replace v24 = "Teacher Technology Educ" if shift==1
	replace v25 = "710" if shift==1
	replace v26 = "28457" if shift==1
	replace v27 = "" if shift==1
	replace v28 = "" if shift==1
	replace v29 = "" if shift==1
	replace v30 = "" if shift==1
	replace v31 = "0" if shift==1
	replace v32 = "" if shift==1
	replace v33 = "" if shift==1
	replace v34 = "100.00" if shift==1
	replace v35 = ".00" if shift==1
	replace v36 = "S" if shift==1
	drop shift

	* set 9
	gen shift = strpos(v13,"experience") > 0 // one obs
	replace v12 = v12 + "and/or experience working with students  who have mental health needs.   Behavioral needs are addressed by mental health  professionals in the classroom setting.   Exceptional Children and Elementary  Ed. certification or experience is preferred but not required" if shift==1 
	replace v13 = "F" if shift==1 // manual fixes because the import didn't parse
	replace v14 = "0" if shift==1
	replace v15 = "" if shift==1
	replace v16 = "$0.00  - $0.00" if shift==1
	replace v17 = "309" if shift==1
	replace v18 = "" if shift==1
	replace v19 = "NC" if shift==1
	replace v20 = "90" if shift==1
	replace v21 = "2013-07-30" if shift==1
	replace v22 = "201 Chatham Street" if shift==1
	replace v23 = "10.00" if shift==1
	replace v24 = "Kindergarten-6th Gr." if shift==1
	replace v25 = "160" if shift==1
	replace v26 = "28570" if shift==1
	replace v27 = "" if shift==1
	replace v28 = "" if shift==1
	replace v29 = "" if shift==1
	replace v30 = "" if shift==1
	replace v31 = "1" if shift==1
	replace v32 = "" if shift==1
	replace v33 = "" if shift==1
	replace v34 = "" if shift==1
	replace v35 = "" if shift==1
	replace v36 = "" if shift==1
	drop shift	
	
	* set 10
	gen shift = strpos(v13,"supplement") > 0 // one obs
	replace v10 = "Associate Degree" if shift==1
	replace v11 = "" if shift==1
	replace v12 = "" if shift==1 
	replace v13 = "F" if shift==1 // manual fixes because the import didn't parse
	replace v14 = "1" if shift==1
	replace v15 = "" if shift==1
	replace v16 = "$11.27 - $14.97" if shift==1
	replace v17 = "450" if shift==1
	replace v18 = "This position earns health insurance and retirement benefit as well as partial leave benefits (sick, annual, etc.) and the local supplement." if shift==1
	replace v19 = "NC" if shift==1
	replace v20 = "90" if shift==1
	replace v21 = "2014-12-30" if shift==1
	replace v22 = "305 Overlook Road" if shift==1
	replace v23 = "10.00" if shift==1
	replace v24 = "Exceptional Children Assistant" if shift==1
	replace v25 = "110" if shift==1
	replace v26 = "28803" if shift==1
	replace v27 = "" if shift==1
	replace v28 = "1" if shift==1
	replace v29 = "0" if shift==1
	replace v30 = "1" if shift==1
	replace v31 = "0" if shift==1
	replace v32 = "2014-08-22" if shift==1
	replace v33 = "" if shift==1
	replace v34 = ".00" if shift==1
	replace v35 = "35.00" if shift==1
	replace v36 = "H" if shift==1
	drop shift	
	
	* set 11
	gen shift = strpos(v13,"current") > 0 // one obs
	replace v12 = "Must hold or be eligible to hold a North Carolina Professional Educators  license in Birth-Kindergarten (License with Pre-K Add-on).  See <a  href=https://drive.google.com/file/d/0BzCaYVgReg7jaG1SYXBQYlgzN2s/edit?usp=shari ng'>description</a>" if shift==1 
	replace v13 = "F" if shift==1 // manual fixes because the import didn't parse
	replace v14 = "1" if shift==1
	replace v15 = "" if shift==1
	replace v16 = "State Salary Schedule" if shift==1
	replace v17 = "400" if shift==1
	replace v18 = "While serving in this role, the individual must hold a current North Carolina  Professional Educators license for his/her area of assignment and hold the  'Highly Qualified' status for any core area of assignment." if shift==1
	replace v19 = "NC" if shift==1
	replace v20 = "80" if shift==1
	replace v21 = "2014-02-17" if shift==1
	replace v22 = "500 Marcella Drive" if shift==1
	replace v23 = "10.00" if shift==1
	replace v24 = "Teacher" if shift==1
	replace v25 = "540" if shift==1
	replace v26 = "28501" if shift==1
	replace v27 = "" if shift==1
	replace v28 = "0" if shift==1
	replace v29 = "0" if shift==1
	replace v30 = "0" if shift==1
	replace v31 = "0" if shift==1
	replace v32 = "2014-01-22" if shift==1
	replace v33 = "2014-06-17" if shift==1
	replace v34 = "100.00" if shift==1
	replace v35 = ".00" if shift==1
	replace v36 = "S" if shift==1
	drop shift	
	





	
	ren v1 hrms_job_id // V
	*ren v2 // 0 or 1
	*ren v3 // A or V
	ren v4 position_available_date
	ren v5 benefits_level
	ren v6 position_type
	ren v7 vacancy_city
	ren v8 continuing_or_temporary
	ren v9 vacancy_create_date
	ren v10 minimum_degree_level
	ren v11 position_description
	ren v12 minimum_experience
	ren v13 full_time_or_part_time
	*ren v14 // 0 or 1
	ren v15 vacancy_school // or vacancy_address
	ren v16 salary_range
	ren v17 vac_school_id // site_code
	ren v18 special_conditions
	ren v19 vacancy_state
	*ren v20 // e.g., 80, 90, 99
	ren v21 vacancy_status_change_date
	ren v22 vacancy_address
	ren v23 position_term
	ren v24 position_title
	ren v25 vac_lea_id // school_district_code
	ren v26 vacancy_zip
	*ren v27 // four-digit numbers
	*ren v28 // 0 or 1
	*ren v29 // 0 or 1
	*ren v30 // 0 or 1
	*ren v31 // 0 or 1
	ren v32 vacancy_closing_date // CHECK!
	ren v33 position_end_date
	*ren v34 // numbers 0 to 100, more near 100
	*ren v35 // numbers 0 to 100, in round numbers: 20, 19.5, 19
	*ren v36 // F, H, or S
	
	* we don't have: position_number, position_comments, vac_id, vac_id2, vac_id3
	
	drop v2 v3 v14 v20 v27 v28 v29 v30 v31 v34 v35 v36
	
	* clean up city
	gen parser = strpos(vacancy_city,", NC")
	replace vacancy_city = substr(vacancy_city,1,parser-1) if parser>0
	replace parser = strpos(vacancy_city, ", N.C.")
	replace vacancy_city = substr(vacancy_city,1,parser-1) if parser>0
	drop parser
	replace vacancy_city = "" if vacancy_city=="NC"
	replace vacancy_city = "" if strpos(vacancy_city,"listed")>0
	replace vacancy_city = trim(upper(vacancy_city))
	gen length_city = length(vacancy_city)
	replace vacancy_city = substr(vacancy_city,1,length_city-1) if substr(vacancy_city,length_city,1)==","
	drop length_city
	
	replace vacancy_state = trim(upper(vacancy_state))
	
	destring position_term vac_lea_id vacancy_zip, replace
	


	* clean up variables and convert string variables to numeric
	foreach var in "position_available_date" "vacancy_create_date" "vacancy_status_change_date" "vacancy_closing_date" "position_end_date" {
		ren `var' `var'_str
		replace `var'_str = "" if `var'_str=="."
		gen yy = substr(`var'_str,1,4)
		gen mm = substr(`var'_str,6,2)
		gen dd = substr(`var'_str,9,2)
		destring yy mm dd, replace
		gen `var' = mdy(mm,dd,yy)
		assert `var'_str == "" if `var'==.
		drop `var'_str yy mm dd
		format `var' %d
	}
	

	
	if `dataCheck' == 1 {
		unique hrms_job_id // we have 68 duplicates
		unique hrms_job_id vac_school_id // we have 1 duplicates

	}
	
	sort hrms_job_id vac_lea_id vac_school_id
	
	compress
	
	save "$basedata/hrms_vacancies", replace
	
}	

********************************************************************************
* Read in HRMS applications to districts
********************************************************************************

if `districtAppsData' == 1 {
	********************************************************************************
	* Create dataset of all unique applications to districts
	* Identifier: hrms_app_id lea_cd
	********************************************************************************

	import delimited using "$rawdata/hrms/Application_Recipient.txt", clear bindquotes(loose) varnames(1)
	ren (apr_app_id apr_recepient_tp_cd apr_unit_cd) (hrms_app_id apr_recipient_tp_cd lea_cd)
	

	destring lea_cd, replace ignore("X")
	
	duplicates drop
	
	if `dataCheck' == 1 {
	
		tab apr_recipient_tp_cd // only takes on values "L", "P", "S"
	
		unique lea_cd
		unique lea_cd apr_recipient_tp_cd // LEAs have multiple recipient types
		
		unique hrms_app_id
		unique hrms_app_id lea_cd // a few applicants with multiple app types
	}
	
	gen apr_rec_tpL = apr_recipient_tp_cd=="L"
	gen apr_rec_tpP = apr_recipient_tp_cd=="P"
	gen apr_rec_tpS = apr_recipient_tp_cd=="S"
	collapse (sum) apr_rec_tp*, by(hrms_app_id lea_cd)
	
	compress
	
	save "$basedata/hrms_district_apps", replace
	
}	
	

	